Question 1

(Q1a) Count the number of negative elements in a vector.

count_neg <- function(vector) {
  neg_numbers = 0 
  for (i in vector) {
    if (i < 0) {
      neg_numbers = neg_numbers + 1
    }
  }
  print(neg_numbers)
}
count_neg(c(1,2,3,4))
## [1] 0
count_neg(c(1,2,-3,4))
## [1] 1
count_neg(c(-1,2,-3,4))
## [1] 2

(Q1b) Calculate the length of a vector ||x||.

vlen <- function(x) {
  if (length(x)) {
    print(sqrt(sum(x^2)))
  }
}
vlen(1)
## [1] 1
vlen(2)
## [1] 2
vlen(c(3,4))
## [1] 5

(Q1c) Your function should take a single input which is the number of bottles to start with.

beer <- function(beer) {
  for (i in beer:1) {
    if (i > 2) {
      cat(i,"bottles of beer on the wall,", i, "bottles of beer.", "\n")
      cat("Take one down and pass it around,", (i-1), "bottles of beer on the wall.","\n\n")
    } else if (i == 2) {
      cat("2 bottles of beer on the wall, 2 bottles of beer.", "\n")
      cat("Take one down and pass it around, 1 bottle of beer on the wall.", "\n\n")
    } else if (i == 1) {
      cat("1 bottle of beer on the wall, 1 bottle of beer.", "\n")
      cat("Take one down and pass it around, no more bottles of beer on the wall.", "\n\n")
      cat("No more bottles of beer on the wall, no more bottles of beer.", "\n")
      cat("Go to the store and buy some more, 99 bottles of beer on the wall.", "\n\n")
    }
  }
}
beer(5)
## 5 bottles of beer on the wall, 5 bottles of beer. 
## Take one down and pass it around, 4 bottles of beer on the wall. 
## 
## 4 bottles of beer on the wall, 4 bottles of beer. 
## Take one down and pass it around, 3 bottles of beer on the wall. 
## 
## 3 bottles of beer on the wall, 3 bottles of beer. 
## Take one down and pass it around, 2 bottles of beer on the wall. 
## 
## 2 bottles of beer on the wall, 2 bottles of beer. 
## Take one down and pass it around, 1 bottle of beer on the wall. 
## 
## 1 bottle of beer on the wall, 1 bottle of beer. 
## Take one down and pass it around, no more bottles of beer on the wall. 
## 
## No more bottles of beer on the wall, no more bottles of beer. 
## Go to the store and buy some more, 99 bottles of beer on the wall.

(Q1d) Compute the correlation of two vectors without using the built-in mean, sd, cor, var functions.

my_cor <- function(x,y) {
  n = length(x)
  numerator <- n*sum(x*y)-sum(x)*sum(y)
  denominator <- sqrt((n*sum(x^2)-sum(x)^2)*(n*sum(y^2)-sum(y)^2))
  print(numerator/denominator)
}
my_cor(1:5, 1:5)
## [1] 1
my_cor(3:1, 1:3)
## [1] -1
my_cor(seq(-5, 5), seq(-5, 5)^2)
## [1] 0

Question 2

data(tli)
glimpse(tli)
## Rows: 100
## Columns: 5
## $ grade    <int> 6, 7, 5, 3, 8, 5, 8, 4, 6, 7, 3, 6, 8, 5, 8, 6, 4, 3, 3, 5, 6…
## $ sex      <fct> M, M, F, M, M, M, F, M, M, M, M, F, M, M, F, F, M, F, M, M, M…
## $ disadvg  <fct> YES, NO, YES, YES, YES, NO, YES, YES, NO, YES, NO, NO, NO, NO…
## $ ethnicty <fct> HISPANIC, BLACK, HISPANIC, HISPANIC, WHITE, BLACK, HISPANIC, …
## $ tlimth   <int> 43, 88, 34, 65, 75, 74, 72, 79, 88, 87, 79, 84, 90, 73, 72, 8…

(Q2a) In which grade did students do best on the exam?

tli |>
  select(grade, tlimth) |>
  group_by(grade) |>
  summarise(avg_score = mean(tlimth)) |>
  slice_max(avg_score)
## # A tibble: 1 × 2
##   grade avg_score
##   <int>     <dbl>
## 1     6      82.3

Grade 6 has the highest average exam scores with a 82.3.

(Q2b) How large is the gap average test scores between students from economically disadvantaged families and students who do not report coming from economically disadvantaged families?

tli |>
  select(disadvg, tlimth) |>
  group_by(disadvg) |>
  summarize(avg_score = mean(tlimth)) |>
  mutate(diff(avg_score))
## # A tibble: 2 × 3
##   disadvg avg_score `diff(avg_score)`
##   <fct>       <dbl>             <dbl>
## 1 NO           78.1             -4.75
## 2 YES          73.3             -4.75

(Q2c) In what grade is the difference in average test performance by gender the largest?

tli |>
  select(grade, sex, tlimth) |>
  group_by(grade, sex) |>
  summarise(avg_score = mean(tlimth)) |>
  mutate(difference = diff(avg_score))
## `summarise()` has grouped output by 'grade'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 4
## # Groups:   grade [6]
##    grade sex   avg_score difference
##    <int> <fct>     <dbl>      <dbl>
##  1     3 F          67         4   
##  2     3 M          71         4   
##  3     4 F          74.2       2.28
##  4     4 M          76.5       2.28
##  5     5 F          69.2      12.6 
##  6     5 M          81.9      12.6 
##  7     6 F          84.6      -4.55
##  8     6 M          80.1      -4.55
##  9     7 F          78.3       5.58
## 10     7 M          83.9       5.58
## 11     8 F          73.6      -4.43
## 12     8 M          69.1      -4.43

Grade 5 has the largest difference in average test performance by gender.

(Q2d) Is the gender difference larger for students from economically disadvantaged backgrounds?

tli |>
  select(sex, disadvg) |> 
  group_by(disadvg) |>
  count(sex)
## # A tibble: 4 × 3
## # Groups:   disadvg [2]
##   disadvg sex       n
##   <fct>   <fct> <int>
## 1 NO      F        36
## 2 NO      M        29
## 3 YES     F        15
## 4 YES     M        20

The gender difference is larger for students not from economically disadvantaged backgrounds.

(Q2e) Which ethnic group exhibits the smallest gender gap? Do you trust this result? Why or why not?

tli |>
  select(sex, ethnicty) |>
  group_by(ethnicty) |>
  count(sex)
## # A tibble: 7 × 3
## # Groups:   ethnicty [4]
##   ethnicty sex       n
##   <fct>    <fct> <int>
## 1 BLACK    F        11
## 2 BLACK    M        12
## 3 HISPANIC F         8
## 4 HISPANIC M        12
## 5 OTHER    F         2
## 6 WHITE    F        30
## 7 WHITE    M        25

Black ethnic group exhibits the smallest gender gap. I do not trust the results because the sample sizes for all ethnic groups ranges.

Would you trust any conclusions from this data?

tli |>
  count(sex)
##   sex  n
## 1   F 51
## 2   M 49
tli |>
  count(grade)
##   grade  n
## 1     3 15
## 2     4 15
## 3     5 15
## 4     6 23
## 5     7 18
## 6     8 14
tli |>
  count(ethnicty)
##   ethnicty  n
## 1    BLACK 23
## 2 HISPANIC 20
## 3    OTHER  2
## 4    WHITE 55
tli |>
  count(disadvg)
##   disadvg  n
## 1      NO 65
## 2     YES 35

I would not trust any conclusions because the number of students in each grade ranges, there is a disparity in the number of students per ethnic background, and there is a disproportion among those considered coming from economically disadvantaged backgrounds and not.

Question 3

data(diamonds)
glimpse(diamonds)
## Rows: 53,940
## Columns: 10
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…

(Q3a) Make a scatter plot of price vs carat and facet it by cut.

ggplot(data = diamonds, aes(x=carat, y=price)) + 
  geom_point() + 
  facet_wrap(~cut) + 
  theme_bw() 

(Q3b) Use geom smooth to see how the price-carat relationship changes by color.

ggplot(data = diamonds, aes(x=carat, y=price)) + 
  geom_point() +
  geom_smooth(aes(color=color), se=FALSE) +
  facet_wrap(~cut) + 
  theme_bw() +
  theme(legend.position = "bottom")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

(Q3c) Create a frequency polygon plot of price, broken out by different diamond cuts.

ggplot(diamonds, aes(x=price, color=cut)) + 
  geom_freqpoly() + 
  facet_wrap(~cut) +
  theme_bw() 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

(Q3d) Create a scatter plot of color by clarity. Why is this plot not useful? Make a better plot to visualize this relationship using the ggmosaic package.

ggplot(diamonds, aes(x=color, y=clarity)) + geom_point() 

This is not a useful plot as there is no clear pattern being depicted. The color and clarity components both represent categorical variables.

ggplot(diamonds) + 
  geom_mosaic(aes(x=product(color), fill=clarity)) + 
  theme_bw() 
## Warning: The `scale_name` argument of `continuous_scale()` is deprecated as of ggplot2
## 3.5.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `unite_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `unite()` instead.
## ℹ The deprecated feature was likely used in the ggmosaic package.
##   Please report the issue at <https://github.com/haleyjeppson/ggmosaic>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Question 4

data(gapminder)
glimpse(gapminder)
## Rows: 1,704
## Columns: 6
## $ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …

(Q4b) Create a scatter plot of the relationship between GDP and Life Expectancy in the year 1952. Color points by continent and use the size aesthetic to represent population.

year_1952 <- subset(gapminder, year == 1952)
ggplot(year_1952, aes(x=gdpPercap, y=lifeExp, color=continent, size = log(pop))) + 
  geom_point() 

(Q4c) There is an outlier country in this data with very high GDP. What is it? Identify and remove it.

year_1952new <- subset(year_1952, country != "Kuwait")
ggplot(year_1952new, aes(x=gdpPercap, y=lifeExp, color=continent, size = log(pop))) + 
  geom_point() 

(Q4d) Using the transition time function, make this an animated plot showing how this data changes over time.

gapminder_new <- subset(gapminder, country != "Kuwait")
ggplot(gapminder_new, aes(x=gdpPercap, y=lifeExp, color=continent, size = log(pop))) + 
  geom_point() + 
  transition_time(year)

(Q4e) Using the theme machinery, labels, etc. make this a ‘publication ready’ plot. Note that you can use {frame time} in the title to get a dynamically changing year.

ggplot(gapminder_new, aes(x=gdpPercap, y=lifeExp, color=continent, size = (log(pop)))) + 
  geom_point() + 
  transition_time(year) +
  theme_bw() +
  ggtitle("Life Expectancy VS GDP ({frame_time})") +
  ylab("Life Expectancy") +
  xlab("GDP Per Capita") +
  theme(legend.position = "bottom") +
  guides(size="none") 

(Q4f) Use the country colors data from the gapminder plot to color the points using Dr. Rosling’s preferred color scheme.

ggplot(gapminder_new, aes(x=gdpPercap, y=lifeExp, color=continent, size = (log(pop)))) + 
  geom_point() + 
  transition_time(year) +
  theme_bw() +
  ggtitle("Life Expectancy VS GDP ({frame_time})") +
  ylab("Life Expectancy") +
  xlab("GDP Per Capita") +
  theme(legend.position = "bottom") +
  guides(size="none") +
  scale_color_manual(values=c("Europe"="brown","Asia"="red","Africa"="blue","Americas"="yellow"))

Question 5

(Q5a) Download the New York City Council Districts (Clipped to Shoreline) shapefiles from https://www.nyc.gov/site/planning/data-maps/open-data/districts-download-metadata.page. Unzip the download and read the shape file, nycc.shp, using the sf::read sf function. Use ggplot2::geom sf to create a basic map of NYC’s city council districts. This should just be a plot of the outlines of each district (recognizable as NYC, but otherwise showing no data).

setwd("/Users/monirulislam/Desktop/Weylandt HW/Weylandt HW3/nycc_24a")
nycc <- read_sf("nycc.shp")
ggplot(nycc) + geom_sf()

(Q5b) Using the provided nyc demos.csv data file, create a chloropleth map of NYC, where the color variable represents the 2010 under 5-year-old population of each district. Which parts of NYC have the fewest young children? Does this seem right?

setwd("/Users/monirulislam/Desktop/Weylandt HW/Weylandt HW3/nycc_24a")
nyc_demos <- read.csv("nyc_demos.csv")
nycc_demos <- inner_join(nyc_demos,nycc, join_by(district==CounDist))
Under5 <-subset(nycc_demos, variable == 'Population') |> select(-Y2000) |> filter(field == "Under 5 years")
ggplot(Under5) + 
  geom_sf(aes(fill=Y2010, geometry=geometry)) + 
  scale_fill_viridis_b() + 
  ggtitle("Population Under 5 Years By Council District") +
  theme_bw()

Some parts of Manhattan and Queens seem to have the fewest young children as opposed to Brooklyn.

(Q5c) Under the ‘one person one vote’ principle, the number of adult residents in each council district should be roughly equal. Create a chloropleth plot showing how many more/fewer adult residents each district has than the average district. Are any districts (significantly) over/underrepresented on the NY city council?

setwd("/Users/monirulislam/Desktop/Weylandt HW/Weylandt HW3/nycc_24a")
voters <- nycc_demos |> 
          filter(field %in% c("18 years and over")) |>
          select(-Y2000) |>
          arrange(district) |> 
          mutate(avg_voters = (sum(Y2010))/51) |>
          mutate(difference = Y2010 - avg_voters) 
ggplot(voters) + 
  geom_sf(aes(fill=difference, geometry=geometry)) + 
  scale_fill_viridis_b() +
  theme_bw() +
  ggtitle("Difference In Adult Population By Average Voters Per District")

Several districts in Brooklyn, including 40, 41, 44, and 45 are significantly under-represented. Whereas, Manhattan districts including 1, 2, and 3 are significantly over represented.

(Q5d) Create a facet plot where each facet is a chloropleth indicating the percentage of residents in each district identifying as a member of each census-designated racial categories. Note that the nyc demos.csv file contains total counts by race, so you will need to normalize by overall population to get percentages.

setwd("/Users/monirulislam/Desktop/Weylandt HW/Weylandt HW3/nycc_24a")
race <- nycc_demos |>
        select(-Y2000) |>
        filter(field %in% c("White Nonhispanic","Black Nonhispanic","Asian and Pacific Islander Nonhispanic","Other Nonhispanic","Two or More Races Nonhispanic","Hispanic Origin")) |>
        group_by(district) |>
        mutate(percentage=(Y2010/sum(Y2010))*100) |>
        arrange(district)
ggplot(race) +
  geom_sf(aes(fill=percentage, geometry=geometry)) + 
  scale_fill_viridis_b() +
  theme_bw() +
  facet_wrap(~field) +
  theme(strip.text = element_text(size = 7)) +
  ggtitle("Percentage Distribution Of Residents By Race")

(Q5e) Create a visualization to compare the ratio of rental vs owner-owned housing per district with the age demographics of this district. What do you find? Is this what you would have expected?

setwd("/Users/monirulislam/Desktop/Weylandt HW/Weylandt HW3/nycc_24a")
Owner_Renter <- nycc_demos |>
  filter(field %in% c("Householder Age - Owner occupied_15 to 24 years",
                      "Householder Age - Owner occupied_25 to 44 years",
                      "Householder Age - Owner occupied_45 to 64 years",
                      "Householder Age - Owner occupied_65 years and over",
                      "Renter occupied_Male householder, no wife present")) |>
  select(-Y2000)|>
  arrange(district) |>
  group_by(district) |>
  mutate(percentage=(Y2010/sum(Y2010))*100) 
ggplot(Owner_Renter) +
  geom_sf(aes(fill=percentage, geometry=geometry)) + 
  scale_fill_viridis_b() +
  facet_wrap(~field) +
  theme_bw() +
  theme(strip.text = element_text(size = 5)) +
  ggtitle("Percentage Distribution Of Owner And Renter Housing Per District By Age")

Based off the visualization and also off expectations, percentage of home ownership between the ages of 15 to 24 is low across all districts because people at this point are still in school, figuring out their lives, and simply do not have the financial means to own a home. There is a higher percentage of renters in the Brooklyn and Manhattan districts because in general asset prices are significantly higher compared to the other boroughs, which makes renting a better financial option.